Validation Dataset from Fisch et al., 2018: https://doi.org/10.1016/j.joca.2018.07.012

Libraries and files loading

Extract DE genes from the supplementary files of the paper

library(readxl)
library(tidyverse)
library(ggvenn)
library(RColorBrewer)
library(DESeq2)
library(ggplot2)
library(limma)
library(heatmap3)
library(msigdbr)
library(clusterProfiler)
library(enrichplot)
library(writexl)


MEDIAsave="/home/mclaudia/MEDIA Project/Codici_finali/"
MEDIAgraph="/home/mclaudia/MEDIA Project/Codici_finali/Paper_figures/"

up_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=1)
dw_genes <- read_excel(paste0(MEDIAsave,"up_dw_genes_consensus.xlsx"), sheet=2)
# accessing all the sheets 
sheet = excel_sheets("~/MEDIA Project/NIHMS992829-supplement-Suppl_matl.xlsx")

# applying sheet names to dataframe names
dataframe = lapply(setNames(sheet, sheet), 
                    function(x) read_excel("~/MEDIA Project/NIHMS992829-supplement-Suppl_matl.xlsx", sheet=x,col_names = FALSE))

# attaching all dataframes together
data_frame = bind_rows(dataframe, .id="Sheet")

data_frame <-data_frame[15:12493, 1:5]
df <-data_frame[64:12479,c(1,3:5)]
df2 <-data_frame[1:63,1:4]
df2 <-df2[-c(1:4),]
dff <-bind_rows(df,df2)

colnames(dff) <-c("Sheet","gene","logFC","pv-adj")
dff$logFC <-as.numeric(dff$logFC)
dff$`pv-adj` <-as.numeric(dff$`pv-adj`)
dff <-dff[,-1]
dff <-dff[!duplicated(dff$gene),]
dff <-dff[complete.cases(dff),]

Save gene data from Validation

save(dff, file=paste0(MEDIAsave,"DEgenes_from_validation.RData"))
load(paste0(MEDIAsave,"DEgenes_from_validation.RData"))

Find out how many UP and DOWN genes are also present in the consensus signature

upInDf <-dff[dff$gene %in% up_genes$gene,]
setdiff(up_genes$gene,dff$gene) 
## [1] "ACKR2"  "R3HDML"
upInDf <-upInDf[order(upInDf$logFC, decreasing = TRUE),]
upInDf_sg <-upInDf[upInDf$`pv-adj` <= 0.05,]

dwInDf <-dff[dff$gene %in% dw_genes$gene,]
setdiff(dw_genes$gene,dff$gene) 
## character(0)
dwInDf <-dwInDf[order(dwInDf$logFC, decreasing = TRUE),]
dwInDf_sg <-dwInDf[dwInDf$`pv-adj` <= 0.05,]

Venn Plot of Consensus genes and DE Validation genes

up_v <-dff[dff$logFC >=1.5 & dff$`pv-adj` <= 0.05,] 
up_v <-up_v[complete.cases(up_v),]  #315

dw_v <-dff[dff$logFC <= -1.5 & dff$`pv-adj` <= 0.05,]  
dw_v <-dw_v[complete.cases(dw_v),] #320

DE_Validation <- c(up_v$gene,dw_v$gene)
consensus_sig <- c(up_genes$gene,dw_genes$gene)

length(intersect(dw_v$gene,dw_genes$gene))
## [1] 3
length(intersect(up_v$gene,up_genes$gene))
## [1] 28
dff2 <-dff[dff$gene %in% DE_Validation,]
dff2 <-dff2[complete.cases(dff2$gene),]

v <-list(DE_Validation=DE_Validation, consensus_sig=consensus_sig)
names(v) <-c("DE genes in Validation Dataset","Consensus signature")

ggvenn(v, show_elements = F,show_percentage = T, text_size =17, set_name_size = 10,
       stroke_size = 0.5, fill_color = brewer.pal(name="Set1",n=4))+
ggtitle("Consensus genes differentially expressed in Validation dataset",
          subtitle = "Validation DE: log2FC threshold = |1.5|, pv_adj <= 0.05")+
theme(plot.title = element_text(hjust = 0.5,vjust = 1, size = 25),
        plot.subtitle = element_text(hjust = 0.5 ,vjust = 1, size = 15))

Fisher test to assess significance for the intersection

load(paste0(MEDIAsave,"combined_full_list.RData"))
conting_table <-data.frame("sign_in_Val"=rep(NA,2),"NOTsign_in_Val"=rep(NA,2))
rownames(conting_table) <-c("sign_in_Cons","NOTsign_in_Cons")


conting_table["sign_in_Cons","sign_in_Val"] <-length(intersect(DE_Validation,consensus_sig)) #31
conting_table["sign_in_Cons","NOTsign_in_Val"] <-length(setdiff(consensus_sig,DE_Validation)) #13

comb_notSig <-combined[!combined$gene %in% consensus_sig,] #13120=13164-44
Val_notSig <-dff[!dff$gene %in% dff2$gene,] #11811=12446-635

Val_notCons <-dff2[!dff2$gene %in% consensus_sig,] 

conting_table["NOTsign_in_Cons","sign_in_Val"] <-nrow(Val_notCons) #604
conting_table["NOTsign_in_Cons","NOTsign_in_Val"]<-length(intersect(comb_notSig$gene,Val_notSig$gene)) #10241

conting_table
##                 sign_in_Val NOTsign_in_Val
## sign_in_Cons             31             13
## NOTsign_in_Cons         604          10241
fisher.test(conting_table) 
## 
##  Fisher's Exact Test for Count Data
## 
## data:  conting_table
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  20.40468 84.37084
## sample estimates:
## odds ratio 
##   40.38563
rm(comb_notSig,Val_notCons,Val_notSig)
write_xlsx(dff2, path=paste0(MEDIAsave,"DEValidationSelected.xlsx"))

png(file = paste0(MEDIAgraph,"Fig5a_venn.png"),  width = 14, height = 8, res=300, units = "in")
ggvenn(v, show_elements = F,show_percentage = T, text_size =14, set_name_size = 9,
       stroke_size = 0.5, fill_color = brewer.pal(name="Set1",n=4))+
ggtitle("Consensus genes differentially expressed in Validation dataset",
          subtitle = "Validation DE: log2FC threshold = |1.5|, pv_adj <= 0.05")+
theme(plot.title = element_text(hjust = 0.5,vjust = 1, size = 25),
        plot.subtitle = element_text(hjust = 0.5 ,vjust = 1, size = 15,margin = margin(0,0,15,0)))
dev.off()
rm(dff2)
norm4dataset <- as.data.frame(read_excel("~/MEDIA Project/GSE114007_raw_counts_4dataset.xlsx", sheet=1))  #Normal samples data
rownames(norm4dataset) <-norm4dataset$symbol
norm4dataset <-norm4dataset[,-1]

dim(norm4dataset) 
## [1] 23710    18
oa4dataset <- as.data.frame(read_excel("~/MEDIA Project/GSE114007_raw_counts_4dataset.xlsx", sheet=2))  #OA samples data
rownames(oa4dataset) <-oa4dataset$symbol
oa4dataset <-oa4dataset[,-1]

dim(oa4dataset)
## [1] 23710    20
norm4dataset <-norm4dataset[rownames(oa4dataset),]
newdataset <-cbind(norm4dataset,oa4dataset)
coldata <-data.frame(class=c(rep("N",18),rep("OA",20)))
coldata$class <-as.factor(coldata$class)
coldata$info <-colnames(newdataset)
coldata$condition <-"cart"
coldata[9:18,"condition"] <-"all"
coldata[19:28,"condition"] <-"cart"
coldata[29:38,"condition"] <-"all"
coldata$condition <-as.factor(coldata$condition)

rownames(coldata) <-coldata$info
newdataset <- newdataset[, rownames(coldata)]

save(newdataset, file=paste0(MEDIAsave,"datasetValidation_full.RData"))

dds <- DESeqDataSetFromMatrix(countData = newdataset,
                              colData = coldata,
                              design = ~condition + class)

keep <- rowSums(assay(dds)>=5) >= 19   #at least 50% of the patients
dds <- dds[keep,]

dds$class <- factor(dds$class, levels = c("N","OA"))
dds$class <- relevel(dds$class, ref = "N")

dds$condition <- factor(dds$condition, levels = c("all","cart"))

nrow(dds)  
## [1] 14643

Variance Stabilizing Trasformation and Principal component analysis

vsd <- vst(dds, blind=FALSE)

pcaData <- plotPCA(vsd, intgroup=c("class", "condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=class, shape=condition,label =  rownames(pcaData))) +
  geom_point(size=3,alpha=1) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()+ggtitle("All samples batch control")+ theme_bw()

Batch correction

mm <- model.matrix(~class, colData(vsd))
assay(vsd)<-removeBatchEffect(assay(vsd), batch = vsd$condition,design=mm) 

pcaData <- plotPCA(vsd, intgroup=c("class", "condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, color=class, shape=condition,label =  rownames(pcaData))) +
  geom_point(size=3,alpha=1) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()+ggtitle("All samples without batch")+ theme_bw()

HEATMAP of consensus signature on the Validation dataset (z-score), batch corrected

normdata <-assay(vsd)
normdata1 <-t(apply(normdata,1,scale))
colnames(normdata1) <-colnames(normdata)
normdata <-normdata1

signature <-c(up_genes$gene,dw_genes$gene)

setdiff(signature,rownames(normdata)) #""ACKR2"   "TMEM59L" "R3HDML"   UP gene missing
## [1] "ACKR2"   "TMEM59L" "R3HDML"
up2 <-up_genes$gene[up_genes$gene %in% rownames(normdata)]

normdata <-normdata[c(up2,dw_genes$gene),]
dim(normdata)
## [1] 41 38
coldataAnn <-coldata
coldataAnn$col2 <-"plum"
coldataAnn[coldataAnn$class=="OA","col2"] <-"cyan"

rowdata <-data.frame(genes=c(up2,dw_genes$gene))
rowdata$col3 <-c(rep("red",36),rep("blue",5))

heatmap3(normdata[rowdata$genes,rownames(coldataAnn)],
         scale="none",
         col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
         RowAxisColors = 1,
         balanceColor = TRUE,
         margins = c(16,10),
         ColSideColors = coldataAnn$col2,
         ColSideLabs = "Class",
         RowSideColors = rowdata$col3,
         RowSideLabs = "DE genes integration",
         cexRow = 0.9, cexCol = 0.9)
legend(0.8,1,legend=c("OA","Normal"),
       fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.9,1,legend=c("UP","DOWN"),
       fill=c("red","blue"), bty = "n", title = "DE")

png(file = paste0(MEDIAgraph,"Fig5b_ValHeatmap.png"),  width = 15, height = 12.4, res=300, units = "in")
heatmap3(normdata[rowdata$genes,rownames(coldataAnn)],
         scale="none",
         col=colorRampPalette(c("royalblue", "white", "red1"))(1024),
         RowAxisColors = 1,
         balanceColor = TRUE,
         margins = c(16,10),
         ColSideColors = coldataAnn$col2,
         ColSideLabs = "Class",
         RowSideColors = rowdata$col3,
         RowSideLabs = "DE genes integration",
         cexRow = .8, cexCol = .9)
legend(0.8,1,legend=c("OA","Normal"),
       fill=c("cyan","plum"), bty = "n", title = "Class")
legend(0.9,1,legend=c("UP","DOWN"),
       fill=c("red","blue"), bty = "n", title = "DE")
dev.off()

Enrichment analysis of consensus signature on Validation Dataset: GSEA Running Plot

sig <-data.frame(gs_name="Consensus signature",gene_symbol=c(up_genes$gene,dw_genes$gene))
colnames(sig) <-c("gs_name","gene_symbol")
colnames(sig) <-c("term_id","gene_id")

gene <-dff$logFC
names(gene) <-dff$gene
gene <-sort(gene,decreasing = TRUE)

set.seed(2459)
gs <- clusterProfiler::GSEA(gene,TERM2GENE = sig,
                             minGSSize = 5,
                             eps = 0,
                             pAdjustMethod = "fdr",
                             pvalueCutoff =0.01)

gseaplot2(gs, title ="Running score GSEA plot - validation dataset", geneSetID = 1, pvalue_table = T) 

Session Info

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] writexl_1.4.2               enrichplot_1.21.0          
##  [3] clusterProfiler_4.4.4       msigdbr_7.5.1              
##  [5] heatmap3_1.1.9              limma_3.52.4               
##  [7] DESeq2_1.36.0               SummarizedExperiment_1.26.1
##  [9] Biobase_2.56.0              MatrixGenerics_1.8.1       
## [11] matrixStats_1.0.0           GenomicRanges_1.48.0       
## [13] GenomeInfoDb_1.32.4         IRanges_2.30.1             
## [15] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## [17] RColorBrewer_1.1-3          ggvenn_0.1.10              
## [19] lubridate_1.9.2             forcats_1.0.0              
## [21] stringr_1.5.0               dplyr_1.1.3                
## [23] purrr_1.0.1                 readr_2.1.4                
## [25] tidyr_1.3.0                 tibble_3.2.1               
## [27] ggplot2_3.4.3               tidyverse_2.0.0            
## [29] readxl_1.4.3               
## 
## loaded via a namespace (and not attached):
##   [1] shadowtext_0.1.2       fastmatch_1.1-4        plyr_1.8.8            
##   [4] igraph_1.5.0           lazyeval_0.2.2         splines_4.2.1         
##   [7] BiocParallel_1.30.4    digest_0.6.32          yulab.utils_0.0.6     
##  [10] htmltools_0.5.6        GOSemSim_2.22.0        viridis_0.6.3         
##  [13] GO.db_3.15.0           fansi_1.0.4            magrittr_2.0.3        
##  [16] memoise_2.0.1          tzdb_0.4.0             fastcluster_1.2.3     
##  [19] Biostrings_2.64.1      annotate_1.74.0        graphlayouts_1.0.0    
##  [22] timechange_0.2.0       colorspace_2.1-0       blob_1.2.4            
##  [25] ggrepel_0.9.3          xfun_0.39              crayon_1.5.2          
##  [28] RCurl_1.98-1.12        jsonlite_1.8.7         scatterpie_0.2.1      
##  [31] genefilter_1.78.0      survival_3.5-5         ape_5.7-1             
##  [34] glue_1.6.2             polyclip_1.10-4        gtable_0.3.4          
##  [37] zlibbioc_1.42.0        XVector_0.36.0         DelayedArray_0.22.0   
##  [40] scales_1.2.1           DOSE_3.22.1            DBI_1.1.3             
##  [43] Rcpp_1.0.11            viridisLite_0.4.2      xtable_1.8-4          
##  [46] gridGraphics_0.5-1     tidytree_0.4.2         bit_4.0.5             
##  [49] httr_1.4.7             fgsea_1.22.0           pkgconfig_2.0.3       
##  [52] XML_3.99-0.14          farver_2.1.1           sass_0.4.7            
##  [55] locfit_1.5-9.8         utf8_1.2.3             ggplotify_0.1.0       
##  [58] tidyselect_1.2.0       labeling_0.4.3         rlang_1.1.1           
##  [61] reshape2_1.4.4         AnnotationDbi_1.58.0   munsell_0.5.0         
##  [64] cellranger_1.1.0       tools_4.2.1            cachem_1.0.8          
##  [67] downloader_0.4         cli_3.6.1              generics_0.1.3        
##  [70] RSQLite_2.3.1          evaluate_0.21          fastmap_1.1.1         
##  [73] yaml_2.3.7             ggtree_3.6.2           babelgene_22.9        
##  [76] knitr_1.43             bit64_4.0.5            tidygraph_1.2.3       
##  [79] KEGGREST_1.36.3        ggraph_2.1.0           nlme_3.1-162          
##  [82] aplot_0.1.10           DO.db_2.9              compiler_4.2.1        
##  [85] rstudioapi_0.15.0      png_0.1-8              treeio_1.20.2         
##  [88] tweenr_2.0.2           geneplotter_1.74.0     bslib_0.5.1           
##  [91] stringi_1.7.12         highr_0.10             lattice_0.21-8        
##  [94] Matrix_1.5-4.1         vctrs_0.6.3            pillar_1.9.0          
##  [97] lifecycle_1.0.3        jquerylib_0.1.4        data.table_1.14.8     
## [100] bitops_1.0-7           patchwork_1.1.3        qvalue_2.28.0         
## [103] R6_2.5.1               gridExtra_2.3          codetools_0.2-19      
## [106] MASS_7.3-58.3          withr_2.5.0            GenomeInfoDbData_1.2.8
## [109] parallel_4.2.1         hms_1.1.3              ggfun_0.1.0           
## [112] rmarkdown_2.24         ggforce_0.4.1